Correlations

First we calculate a correlation matrix for the tags.

practices_cor <- tags %>%
  select(starts_with("practices")) %>%
  cor

practices_jac <- tags %>%
  select(starts_with("practices")) %>%
  simil(method = "Jaccard", by_rows = FALSE)

plot_tag_cor <- function(cor_mat, title = "") {
  ggcorrplot(cor_mat, hc.order = T, type = "upper") +
    scale_fill_distiller(type = "div", limits = c(-1, 1), expand = c(0, 0)) +
    labs(title = title,
         fill = "Correlation") +
    scale_x_discrete(labels = label_tags()) +
    scale_y_discrete(labels = label_tags()) +
    labs(x = "", y = "") + 
#    scale_fill_cc_gradient + 
    theme_transcend_sparse + 
    theme(axis.text.x = element_blank(), 
          axis.text.y = element_blank(),
          panel.border = element_blank(),
          axis.ticks = element_blank()
          )
}
cor_plot <- plot_tag_cor(practices_cor, 
                         title = "Correlation heat map for all tags") +
            theme(legend.position = c(.85, .25))

ggplotly(cor_plot)
jac_plot <- plot_tag_cor(as.matrix(practices_jac), 
                         title = "Jaccard similarity for all tags") +
  theme(legend.position = c(.85, .25)) + 
  scale_fill_distiller(type = "div", limits = c(0, 1), expand = c(0, 0))

ggplotly(jac_plot)
mat_to_df <- function(m) {
  data.frame(row=rownames(m)[row(m)[upper.tri(m)]], 
           col=colnames(m)[col(m)[upper.tri(m)]], 
           corr=m[upper.tri(m)])
}

d_corr <- mat_to_df(practices_cor)
d_jacc <- mat_to_df(as.matrix(practices_jac))

jacc_corr <- d_jacc %>%
  rename(jaccard = corr) %>%
  left_join(d_corr, by = c("row", "col")) %>%
  rename(correlation = corr) %>%
  mutate(
    jacc_rank = rank(-jaccard),
    corr_rank = rank(-correlation),
    rank_diff = jacc_rank - corr_rank
  ) %>% 
  arrange(desc(jaccard))

jacc_corr %>% 
  datatable(
    caption = "Comparison of Jaccard Similarity and Correlation Coeffficients",
    colnames = c("Jaccard (0,1)" = "jaccard", "Correlation (-1,1)" = "correlation")
  ) %>%
  formatRound(digits = 2, columns = c("Jaccard (0,1)", "Correlation (-1,1)")) 
corr_hist <- ggplot(jacc_corr, aes(x = correlation)) +
  geom_histogram(binwidth = 0.03, fill = transcend_cols[1]
                 ) +
  geom_vline(aes(xintercept = mean(correlation)), 
             color = transcend_cols[3], 
             size = 1) +
  geom_text(aes(x = mean(correlation), y = Inf, 
                label = paste("Average:",round(mean(correlation), 2))), 
            hjust = -.1, check_overlap = TRUE, vjust = 1.1, 
            family = "Open Sans") + 
  bar_y_scale_count +
  scale_x_continuous(limits = c(-1, 1), expand = expansion(0, 0)) +
  labs(title = "Distribution of pairwise tag correlations", 
       y = "Count of Tag Pairs",
       x = "Correlation") +
  theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
jacc_hist <- ggplot(jacc_corr, aes(x = jaccard)) +
  geom_histogram(binwidth = 0.03, fill = transcend_cols[2]
                 ) +
  geom_vline(aes(xintercept = mean(jaccard)), 
             color = transcend_cols[3], 
             size = 1) +
  geom_text(aes(x = mean(jaccard), y = Inf, 
                label = paste("Average:", round(mean(jaccard), 2))), 
            hjust = -.1, check_overlap = TRUE, vjust = 1.1, 
            family = "Open Sans") + 
  bar_y_scale_count +
  scale_x_continuous(limits = c(0, 1), expand = expansion(0, 0)) +
  labs(title = "Distribution of pairwise tag similarities", 
       y = "Count of Tag Pairs",
       x = "Jaccard Similarity") +
  theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt")) 

corr_hist + jacc_hist
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Clustering

Scree Plots

fa_cor <- fa.parallel(
  practices_cor, fm = "pa", fa = "fa", n.obs = nrow(tags),
  main = "Correlation scree plot"
)

## Parallel analysis suggests that the number of factors =  10  and the number of components =  NA
jac_mat <- as.matrix(practices_jac)
jac_mat[is.na(jac_mat)] = 1

fa_jac <- fa.parallel(jac_mat, fm = "pa", fa = "fa", 
                      n.obs = nrow(tags),
                      main = "Jaccard scree plot"
)

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

Parallel Analysis

# n_i  <- nrow(values_3) # The number of cases in our data
# n_p <- ncol(values_3) # The number of variables in our data
# 
# set.seed(2)   # To reproduce our randomly generated results.
# 
# Eigs <- pca_3$values    # The eigenvalues
# n_components  <- length(Eigs) # number of components
# 
# paral <- parallel(subject = n_i,  # The number of cases in our data
#                   var = n_p,  # The number of variables in our data
#                   rep = 1000,
#                   quantile = .95,
#                    model  = "components")
# 
# ParallelAna <- data.frame(Ncomponent  = 1:n_components,
#                            Eigs,
#                            RandEigM = paral$eigen$mevpea,
#                            RandEig95= paral$eigen$qevpea)
# 
# ParallelAna <- round(ParallelAna, 3)
# ParallelAna
# exceeder <- ParallelAna[ParallelAna[, "RandEig95"] > ParallelAna[, "Eigs"], ][1,]
# exceeder

Retaining 3 clusters

efa_2023 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  write_tsv(here("output", "EFA 2023 Results Max.txt"), na = "") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:8) 
efa_2023 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  write_tsv(here("output", "EFA 2023 Results Threshold.txt"), na = "") %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:8) 
efa_2023 %>%
  model_parameters(sort = TRUE) %>%
  write_tsv(here("output", "EFA 2023 Results All.txt"), na = "") %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:8) 
# Warning message included below, just in case:
# DataTables warning: table id=DataTables_Table_0 - Requested unknown parameter '7' for row 0, column 7. For more information about this error, please see http://datatables.net/tn/4